1 Load packages

#install.packages("softmaxreg")
#install.packages("DMwR")
#install.packages("esquisse")
#install.packages("inspectdf")
#install.packages("DataExplorer")

# Load packages
library(tidyverse)
library(ggplot2)
library(GGally)
library(stringr)
library(gridExtra)
library(scales)
library(ggthemes)
library(gghighlight)
library(janitor)
library(MultinomialCI)
library(htmlTable)
library(formattable)
library(DMwR)
library(esquisse)
library(inspectdf)

# Stop scientific notation
options(scipen = 999)

2 Describe project

Airbnb, Inc. is an online marketplace for arranging or offering lodging, primarily homestays, or tourism experiences. The company does not own any of the real estate listings, nor does it host events; it acts as a broker, receiving commissions from each booking. In this project, I explore the Seattle Airbnb market and provide some recommendations to current Airbnb hosts and to people who want to host their home in the future.

I will analyze the Seattle Airbnb market data and determine the key findings, which are:

  • Be a superhost

    • Airbnb of superhost is more popular.
    • Even though the listing price of superhosts is a little lower than that of non-superhosts, the revenue of superhosts is higher across the different performance levels.
    • For most of the cancellation policy, superhost can earn more median revenue than non-superhost.
    • Superhost can earn more median revenue per year across each neighbourhood group.
  • Provide a reasonable price

    • most of the booking prices are under $250
  • Make your Airbnb home instant bookable

    • The price distribution of instant bookable Airbnbs is higher than that of non-instant bookable Airbnbs.
    • In each neighbourhood group, instant bookable Airbnbs get higher median revenue.
  • Host the home with reasonable accommodates

    • most of the bookings are less than 5 accommodates.
  • Respond to your customers’ questions

    • Response rate of most Good and Excellent performance hosts are high.
  • Make flexible or moderate cancellation policies

    • flexible and moderate cancellation policies have relatively higher median annual revenue.
  • choose the right neighbourhood group

    • Downtown, Cascade and Queen Anne have higher median price distributions.

3 Read in data and prepare for analysis

# Load data
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0 as well;
# the droplevels() calls below and the later levels()<- relabelling need factors.
sea_airbnb <- read.csv("listings.csv", stringsAsFactors = TRUE)

# Look at the raw data before cleaning
str(sea_airbnb)
summary(sea_airbnb)

# Rename columns 9 and 10. They are picked by position, so this depends on the
# column order of listings.csv.
names(sea_airbnb)[9] <- "neighbourhood"
names(sea_airbnb)[10] <- "neighbourhood_group"

# Delete rows containing NAs and rows whose response rate is the literal
# string "N/A", then drop the factor levels that no longer occur
sea_airbnb <- sea_airbnb[complete.cases(sea_airbnb), ]
sea_airbnb <- sea_airbnb[sea_airbnb$host_response_rate != "N/A", ]
sea_airbnb$host_response_time <- droplevels(sea_airbnb$host_response_time)
sea_airbnb$host_is_superhost <- droplevels(sea_airbnb$host_is_superhost)
sea_airbnb$host_response_rate <- droplevels(sea_airbnb$host_response_rate)

# Extract the year only from host_since (parsed as month/day/two-digit year).
# as.Date() is used instead of strptime() so that no POSIXlt value is ever
# stored in a data frame column.
sea_airbnb$host_since <- as.numeric(format(
  as.Date(as.character(sea_airbnb$host_since), format = "%m/%d/%y"),
  "%Y"
))

# Convert the percentage and currency columns to numbers
sea_airbnb$host_response_rate <-
  as.numeric(sub("%", "", sea_airbnb$host_response_rate, fixed = TRUE)) / 100
# Remove every "$" and every "," before converting. Stripping only the "$"
# leaves thousands separators in place, as.numeric() then returns NA for those
# prices, and the complete.cases() filter below silently drops the listings.
# NOTE(review): assumes large prices are written with "," separators — confirm
# against listings.csv.
sea_airbnb$price <- as.numeric(gsub("[$,]", "", sea_airbnb$price))

# Drop any rows whose values still could not be converted (NA after as.numeric)
sea_airbnb <- sea_airbnb[complete.cases(sea_airbnb), ]
  • There are 9040 observations and 25 variables before cleaning the data.

4 Base EDA Step 1: Univariate non-graphical analysis

# Preview the first six rows of the cleaned data
head(sea_airbnb, n = 6L)
##     id host_id host_name host_since host_response_time host_response_rate
## 1 2318    2536     Megan       2008     within an hour                  1
## 3 6606   14942     Joyce       2009 within a few hours                  1
## 4 9419   30559 Angielena       2009 within a few hours                  1
## 5 9460   30832     Siena       2009     within an hour                  1
## 6 9531   31481    Cassie       2009 within a few hours                  1
## 7 9534   31481    Cassie       2009 within a few hours                  1
##   host_is_superhost host_total_listings_count  neighbourhood
## 1                 t                         2        Madrona
## 3                 f                         5    Wallingford
## 4                 t                         8     Georgetown
## 5                 t                         4     First Hill
## 6                 t                         2 Fairmount Park
## 7                 t                         2 Fairmount Park
##   neighbourhood_group zipcode latitude longitude property_type
## 1        Central Area   98122 47.61082 -122.2908         House
## 3 Other neighborhoods   98103 47.65411 -122.3376    Guesthouse
## 4 Other neighborhoods   98108 47.55062 -122.3201     Apartment
## 5            Downtown   98101 47.61265 -122.3294   Condominium
## 6        West Seattle   98136 47.55539 -122.3847         House
## 7        West Seattle   98136 47.55624 -122.3860   Guest suite
##         room_type accommodates bathrooms bedrooms beds price
## 1 Entire home/apt            9       2.5        4    4   296
## 3 Entire home/apt            2       1.0        1    1    90
## 4    Private room            2       3.0        1    1    62
## 5    Private room            2       1.0        1    1    99
## 6 Entire home/apt            4       1.0        2    3   165
## 7 Entire home/apt            3       1.0        2    2   125
##   number_of_reviews review_scores_rating instant_bookable
## 1                28                  100                f
## 3               147                   92                f
## 4               144                   93                f
## 5               443                   98                t
## 6                37                  100                f
## 7                44                  100                f
##           cancellation_policy reviews_per_month
## 1 strict_14_with_grace_period              0.21
## 3 strict_14_with_grace_period              1.19
## 4                    moderate              1.29
## 5                    moderate              3.62
## 6 strict_14_with_grace_period              0.39
## 7 strict_14_with_grace_period              0.47
  • Data appears tidy and ready for analysis

    • data in each column is of the same variable type
    • no duplicate columns
    • each row is an observation of an unique listing
# Weighted review score = rating x host-year weight x log(number of reviews).
# The weights are computed inline, so no temporary columns need to be removed.
# Alternative year weighting that was left disabled:
#sea_airbnb$year_weight <- SoftMax(max(sea_airbnb$host_since)-sea_airbnb$host_since)
# NOTE(review): SoftMax() presumably comes from DMwR, which is attached at the
# top of the script (softmaxreg is only mentioned in a commented-out install) —
# confirm.
sea_airbnb$weighted_score <- sea_airbnb$review_scores_rating *
  SoftMax(sea_airbnb$host_since) *
  log(sea_airbnb$number_of_reviews)

# Bin the weighted score into performance categories:
# [0, 20] Bad, (20, 120] Poor, (120, 300] Fair, (300, 400] Good, > 400 Excellent
sea_airbnb$performance <- cut(
  sea_airbnb$weighted_score,
  breaks = c(0, 20, 120, 300, 400, Inf),
  labels = c("Bad", "Poor", "Fair", "Good", "Excellent"),
  include.lowest = TRUE
)

# Relabel the two-level flag factors as No/Yes (levels are replaced in their
# existing order)
levels(sea_airbnb$instant_bookable) <- c("No", "Yes")
levels(sea_airbnb$host_is_superhost) <- c("No", "Yes")

# Per-column summary statistics, including the new weighted_score and performance
summary(object = sea_airbnb)
##        id              host_id                       host_name   
##  Min.   :    2318   Min.   :      862   Corp Condos & Apts: 233  
##  1st Qu.:12908346   1st Qu.:  7911180   Day 1             : 114  
##  Median :21357272   Median : 26967583   Addison           :  92  
##  Mean   :20375502   Mean   : 57739200   Loftium           :  84  
##  3rd Qu.:28635671   3rd Qu.: 82961680   Dario             :  82  
##  Max.   :38649181   Max.   :293180955   Melissa           :  73  
##                                         (Other)           :5623  
##    host_since            host_response_time host_response_rate
##  Min.   :2008   a few days or more:  19     Min.   :0.0000    
##  1st Qu.:2013   within a day      : 248     1st Qu.:1.0000    
##  Median :2015   within a few hours: 687     Median :1.0000    
##  Mean   :2015   within an hour    :5347     Mean   :0.9829    
##  3rd Qu.:2016                               3rd Qu.:1.0000    
##  Max.   :2019                               Max.   :1.0000    
##                                                               
##  host_is_superhost host_total_listings_count     neighbourhood 
##  No :2994          Min.   :   0.00           Broadway   : 372  
##  Yes:3307          1st Qu.:   1.00           Belltown   : 358  
##                    Median :   2.00           Wallingford: 242  
##                    Mean   :  83.92           First Hill : 238  
##                    3rd Qu.:   9.00           Minor      : 235  
##                    Max.   :1795.00           Fremont    : 202  
##                                              (Other)    :4654  
##           neighbourhood_group    zipcode        latitude    
##  Other neighborhoods:1195     98122  : 651   Min.   :47.50  
##  Downtown           :1091     98103  : 591   1st Qu.:47.60  
##  Capitol Hill       : 639     98101  : 471   Median :47.62  
##  Central Area       : 574     98144  : 342   Mean   :47.62  
##  Queen Anne         : 449     98121  : 331   3rd Qu.:47.66  
##  West Seattle       : 362     98109  : 328   Max.   :47.74  
##  (Other)            :1991     (Other):3587                  
##    longitude          property_type            room_type   
##  Min.   :-122.4   Apartment  :1985   Entire home/apt:4833  
##  1st Qu.:-122.4   House      :1953   Hotel room     : 111  
##  Median :-122.3   Guest suite: 802   Private room   :1276  
##  Mean   :-122.3   Townhouse  : 490   Shared room    :  81  
##  3rd Qu.:-122.3   Condominium: 384                         
##  Max.   :-122.2   Guesthouse : 223                         
##                   (Other)    : 464                         
##   accommodates      bathrooms         bedrooms          beds       
##  Min.   : 1.000   Min.   : 0.000   Min.   :0.000   Min.   : 0.000  
##  1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.:1.000   1st Qu.: 1.000  
##  Median : 3.000   Median : 1.000   Median :1.000   Median : 1.000  
##  Mean   : 3.771   Mean   : 1.299   Mean   :1.372   Mean   : 1.944  
##  3rd Qu.: 4.000   3rd Qu.: 1.000   3rd Qu.:2.000   3rd Qu.: 2.000  
##  Max.   :28.000   Max.   :16.000   Max.   :8.000   Max.   :49.000  
##                                                                    
##      price       number_of_reviews review_scores_rating instant_bookable
##  Min.   :  0.0   Min.   :  1.00    Min.   : 20.00       No :2876        
##  1st Qu.: 80.0   1st Qu.: 10.00    1st Qu.: 94.00       Yes:3425        
##  Median :119.0   Median : 35.00    Median : 97.00                       
##  Mean   :156.8   Mean   : 65.17    Mean   : 95.15                       
##  3rd Qu.:180.0   3rd Qu.: 90.00    3rd Qu.: 99.00                       
##  Max.   :999.0   Max.   :767.00    Max.   :100.00                       
##                                                                         
##                   cancellation_policy reviews_per_month weighted_score  
##  flexible                   :1093     Min.   : 0.010    Min.   :  0.00  
##  moderate                   :2477     1st Qu.: 0.840    1st Qu.: 15.71  
##  strict                     : 267     Median : 2.200    Median :124.77  
##  strict_14_with_grace_period:2398     Mean   : 2.771    Mean   :156.44  
##  super_strict_30            :  51     3rd Qu.: 4.160    3rd Qu.:280.45  
##  super_strict_60            :  15     Max.   :14.870    Max.   :562.52  
##                                                                         
##     performance  
##  Bad      :1645  
##  Poor     :1442  
##  Fair     :1869  
##  Good     : 940  
##  Excellent: 405  
##                  
## 
  • New variables explanation:

    • The number of reviews may be affected by how long the host has been active: some Airbnbs may have many reviews simply because they have been hosted for a long time. I therefore use a year weight to reduce this effect: the longer the host has been active, the lower the weight, so that the review score is less affected by host years.
    • The review score is also affected by the number of reviews, so I use the logarithm of the number of reviews as the review weight, to avoid extremely high review scores.
    • Then I got the weighted score of reviews.
  • Data observations

    • Many hosts provide several airbnb listings.
    • Most of the hosts response within an hour.
    • The host response rate looks unsymmetric.
    • More than half of the hosts are superhost.
    • Host total listings look unsymmetric.
    • Most of the property types are apartment and house.
    • Most of the room types are entire home/apt and private room.
    • Accommodates look unsymmetric.
    • Bathrooms, bedrooms, and beds look unsymmetric.
    • Price look unsymmetric.
    • Number of reviews look unsymmetric.
    • Review scores rating look unsymmetric.
    • More than half of the airbnbs are instant bookable.
    • Reviews per month look unsymmetric.
    • Weighted score looks unsymmetric.
  • Questions on data

    • Which neighbourhood that Airbnb located has the highest price?
    • Which property type of Airbnb has the highest price?
    • What factors have affect on price?
    • What factors have affect on weighted review scores?
    • Which property type is most popular?
    • What are the features of Airbnb with high review scores?
# Show the column types and a few leading values of the cleaned data
str(object = sea_airbnb)
## 'data.frame':    6301 obs. of  27 variables:
##  $ id                       : int  2318 6606 9419 9460 9531 9534 9596 9909 11012 14386 ...
##  $ host_id                  : int  2536 14942 30559 30832 31481 31481 14942 33360 14942 39377 ...
##  $ host_name                : Factor w/ 2449 levels "","'Keia","(Email hidden by Airbnb)",..: 1583 1109 140 2081 400 400 1109 1325 1109 286 ...
##  $ host_since               : num  2008 2009 2009 2009 2009 ...
##  $ host_response_time       : Factor w/ 4 levels "a few days or more",..: 4 3 3 4 3 3 3 4 3 2 ...
##  $ host_response_rate       : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ host_is_superhost        : Factor w/ 2 levels "No","Yes": 2 1 2 2 2 2 1 2 1 1 ...
##  $ host_total_listings_count: int  2 5 8 4 2 2 5 8 5 3 ...
##  $ neighbourhood            : Factor w/ 89 levels "Adams","Alki",..: 43 82 25 21 19 19 82 19 82 26 ...
##  $ neighbourhood_group      : Factor w/ 17 levels "Ballard","Beacon Hill",..: 5 12 12 7 17 17 12 17 12 12 ...
##  $ zipcode                  : Factor w/ 34 levels ""," ","90105",..: 22 7 13 5 27 27 7 24 7 7 ...
##  $ latitude                 : num  47.6 47.7 47.6 47.6 47.6 ...
##  $ longitude                : num  -122 -122 -122 -122 -122 ...
##  $ property_type            : Factor w/ 29 levels "Apartment","Bed and breakfast",..: 18 15 1 10 18 14 1 18 18 18 ...
##  $ room_type                : Factor w/ 4 levels "Entire home/apt",..: 1 1 3 3 1 1 1 1 1 3 ...
##  $ accommodates             : int  9 2 2 2 4 3 4 8 8 2 ...
##  $ bathrooms                : num  2.5 1 3 1 1 1 1 2 2 1.5 ...
##  $ bedrooms                 : int  4 1 1 1 2 2 1 3 3 1 ...
##  $ beds                     : int  4 1 1 1 3 2 4 5 3 1 ...
##  $ price                    : num  296 90 62 99 165 125 120 125 299 40 ...
##  $ number_of_reviews        : int  28 147 144 443 37 44 91 71 91 141 ...
##  $ review_scores_rating     : int  100 92 93 98 100 100 91 96 96 92 ...
##  $ instant_bookable         : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 2 1 1 ...
##  $ cancellation_policy      : Factor w/ 6 levels "flexible","moderate",..: 4 4 2 2 4 4 4 4 4 4 ...
##  $ reviews_per_month        : num  0.21 1.19 1.29 3.62 0.39 0.47 0.9 0.59 0.78 1.27 ...
##  $ weighted_score           : num  0.0299 0.1707 0.1718 0.222 0.1342 ...
##  $ performance              : Factor w/ 5 levels "Bad","Poor","Fair",..: 1 1 1 1 1 1 1 1 1 1 ...
  • 6301 observations of 27 variables
  • 8 integer and 8 numeric variables
  • 11 factor variables

5 Base EDA Step 2: Univariate graphical analysis

Now, I will explore some important variable individually.

5.1 Categorical/Factor variables

I selected host_response_time, host_is_superhost, neighbourhood, neighbourhood_group, zipcode, property_type, room_type, instant_bookable, cancellation_policy, and performance to examine individually.

5.1.1 host_response_time and host_is_superhost

# Listing counts by host response time (top) and by superhost status (bottom)
grid.arrange(
  ggplot(sea_airbnb, aes(x = host_response_time)) + geom_bar(),
  ggplot(sea_airbnb, aes(x = host_is_superhost)) + geom_bar(),
  ncol = 1
)

Comments

- Most of the hosts response within an hour.
- Superhost is a little bit more than non-superhost.

5.1.2 neighbourhood and neighbourhood_group

# Listing counts per neighbourhood (top) and per neighbourhood group (bottom);
# the x-axis labels are shrunk and rotated because there are many categories
grid.arrange(
  ggplot(sea_airbnb, aes(x = neighbourhood)) +
    geom_bar() +
    theme(axis.text.x = element_text(angle = 90, size = 5)),
  ggplot(sea_airbnb, aes(x = neighbourhood_group)) +
    geom_bar() +
    theme(axis.text.x = element_text(angle = 20, size = 8)),
  ncol = 1
)

Comments

- For neighbourhood, Belltown and Broadway have the largest number of Airbnbs.
- For grouped neighbourhood, Downtown has the most number of Airbnbs (besides Other neighbourhoods).

5.1.3 zipcode

# Listing counts per zipcode, with small rotated x-axis labels
ggplot(sea_airbnb, aes(x = zipcode)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 30, size = 5))

Comments

- 98101, 98103, and 98122 have the most number of Airbnbs

5.1.4 property_type and room_type

# Listing counts per property type (top) and per room type (bottom)
grid.arrange(
  ggplot(sea_airbnb, aes(x = property_type)) +
    geom_bar() +
    theme(axis.text.x = element_text(angle = 30, size = 5)),
  ggplot(sea_airbnb, aes(x = room_type)) + geom_bar(),
  ncol = 1
)

Comments

- Most of the Airbnb property types in Seattle are apartment and house.
- Most of the Airbnb room type in Seattle is entire home/apt.

5.1.5 instant_bookable and cancellation_policy

# Listing counts by instant-bookable flag (top) and cancellation policy (bottom)
grid.arrange(
  ggplot(sea_airbnb, aes(x = instant_bookable)) + geom_bar(),
  ggplot(sea_airbnb, aes(x = cancellation_policy)) +
    geom_bar() +
    theme(axis.text.x = element_text(angle = 10, size = 10)),
  ncol = 1
)

Comments

- The instant bookable Airbnb is more than non-instant bookable Airbnb.
- Most of the Airbnbs have flexible, moderate or strict 14 with grace period cancellation policy.

5.1.6 performance

# Listing counts per performance category
ggplot(sea_airbnb, aes(x = performance)) + geom_bar()

Comments

  • Performance variable is an overall ranking based on review numbers, host years, and review scores (i.e: weighted score).

  • We can find that nearly half of the Airbnb in Seattle don’t have a very good performance.

    • Even though some Airbnbs seem to have high review scores, this may be because they have only a few reviews, which makes those scores less convincing.
    • The Airbnb of good and excellent performance must have many high score reviews no matter the years of host since.

5.2 Quantitative variables

I will examine on:

- host_total_listings_count
- host_response_rate
- accommodates
- bedrooms
- price
- number_of_reviews
- reviews_per_month
- weighted_score
# Distribution of host_total_listings_count: histogram (top) and a boxplot
# flipped to horizontal (bottom)
grid.arrange(
  ggplot(sea_airbnb, aes(x = host_total_listings_count)) + geom_histogram(),
  ggplot(sea_airbnb, aes(x = 1, y = host_total_listings_count)) +
    geom_boxplot() +
    coord_flip(),
  ncol = 1
)

Comments

  • Most of the Airbnb hosts have only a few listings

5.2.1 host_response_rate

# Distribution of host_response_rate: histogram (top) and a boxplot flipped to
# horizontal (bottom)
grid.arrange(
  ggplot(sea_airbnb, aes(x = host_response_rate)) + geom_histogram(),
  ggplot(sea_airbnb, aes(x = 1, y = host_response_rate)) +
    geom_boxplot() +
    coord_flip(),
  ncol = 1
)

Comments

  • The distribution of host_response_rate is skewed

    • Median would be better than mean.
    • Most of the hosts have more than 85% response rate.
    • There are some outliers less than 1.
    • The outliers may affect the review scores, so I cannot remove them.

5.2.2 accommodates

# Distribution of accommodates: one bar per guest count (top) and a boxplot
# flipped to horizontal (bottom)
grid.arrange(
  ggplot(sea_airbnb, aes(x = accommodates)) + geom_bar(),
  ggplot(sea_airbnb, aes(x = 1, y = accommodates)) +
    geom_boxplot() +
    coord_flip(),
  ncol = 1
)

Comments

  • Confirm the accommodates is a skewed distribution

    • Median would be better than mean.
    • Most of the accommodates are under 8.
    • Most of the accommodates are clustered around 5.

5.2.3 bedrooms

# Distribution of bedrooms: one bar per bedroom count (top) and a boxplot
# flipped to horizontal (bottom)
grid.arrange(
  ggplot(sea_airbnb, aes(x = bedrooms)) + geom_bar(),
  ggplot(sea_airbnb, aes(x = 1, y = bedrooms)) +
    geom_boxplot() +
    coord_flip(),
  ncol = 1
)

Comments

  • Confirms the bedrooms variable is a skewed distribution

    • Median would be better than mean.
    • Most of the bedrooms number is under 3.
    • There are five outliers, and the maximum number of bedrooms is 8.
    • One bedroom is most common.

5.2.4 price

# Distribution of nightly price: histogram (top) and a boxplot flipped to
# horizontal (bottom).
# price is a continuous variable, so it is binned with geom_histogram() (as is
# done for the other continuous variables) instead of geom_bar(), which draws
# one bar per distinct price. binwidth = 25 groups prices into $25 bins.
grid.arrange(

  ggplot(data = sea_airbnb, mapping = aes(x = price)) +
    geom_histogram(binwidth = 25),

  ggplot(data = sea_airbnb, mapping = aes(x = 1, y = price)) +
    geom_boxplot() +
    coord_flip(),

  ncol = 1

)

Comments

  • The distribution is unsymmetric

    • Median would be better than mean.
    • Most of the Airbnb room prices are less than 250 dollars per night.
    • There are several outliers, but they are important for my research questions so I won’t remove them.

5.2.5 number_of_reviews

# Distribution of number_of_reviews: one bar per review count (top) and a
# boxplot flipped to horizontal (bottom)
grid.arrange(
  ggplot(sea_airbnb, aes(x = number_of_reviews)) + geom_bar(),
  ggplot(sea_airbnb, aes(x = 1, y = number_of_reviews)) +
    geom_boxplot() +
    coord_flip(),
  ncol = 1
)

5.2.6 reviews_per_month

# Distribution of reviews_per_month: histogram (top) and a boxplot flipped to
# horizontal (bottom).
# reviews_per_month is a continuous variable recorded to two decimals, so
# geom_bar() would draw a separate, very thin bar for every distinct value.
# geom_histogram() bins the values instead; binwidth = 0.5 reviews per month.
grid.arrange(

  ggplot(data = sea_airbnb, mapping = aes(x = reviews_per_month)) +
    geom_histogram(binwidth = 0.5),

  ggplot(data = sea_airbnb, mapping = aes(x = 1, y = reviews_per_month)) +
    geom_boxplot() +
    coord_flip(),

  ncol = 1

)

Comments

  • The distribution is unsymmetric

    • Median would be better than mean.
    • Most of the monthly reviews are under 5.

5.2.7 review_scores_rating

# Distribution of review_scores_rating: one bar per score (top) and a boxplot
# flipped to horizontal (bottom)
grid.arrange(
  ggplot(sea_airbnb, aes(x = review_scores_rating)) + geom_bar(),
  ggplot(sea_airbnb, aes(x = 1, y = review_scores_rating)) +
    geom_boxplot() +
    coord_flip(),
  ncol = 1
)

Comments

  • Confirms the review scores rating is skewed

    • Median would be better than mean.
    • Most of the review scores is more than 90.
    • The outliers are on the left side, and the minimum review score is 20.

Questions

  • Do these scores objectively reflect the rating based on different number of reviews?

    • Take a look at the weighted review scores across number of reviews.

6 Base EDA Step 3: Multi-variate Non-graphical Analysis

6.1 Quantitative Only

# Pairwise correlations between the numeric columns, rounded to two decimals;
# identifier, host-year and coordinate columns are excluded
sea_airbnb %>%
  select_if(is.numeric) %>%
  select(-c(id, host_id, host_since, latitude, longitude)) %>%
  cor() %>%
  round(digits = 2)
##                           host_response_rate host_total_listings_count
## host_response_rate                      1.00                      0.01
## host_total_listings_count               0.01                      1.00
## accommodates                            0.01                     -0.02
## bathrooms                               0.01                     -0.01
## bedrooms                                0.00                     -0.05
## beds                                    0.01                     -0.06
## price                                   0.01                      0.47
## number_of_reviews                       0.07                     -0.19
## review_scores_rating                    0.09                     -0.23
## reviews_per_month                       0.11                     -0.23
## weighted_score                          0.08                     -0.23
##                           accommodates bathrooms bedrooms  beds price
## host_response_rate                0.01      0.01     0.00  0.01  0.01
## host_total_listings_count        -0.02     -0.01    -0.05 -0.06  0.47
## accommodates                      1.00      0.52     0.80  0.82  0.46
## bathrooms                         0.52      1.00     0.58  0.47  0.32
## bedrooms                          0.80      0.58     1.00  0.71  0.40
## beds                              0.82      0.47     0.71  1.00  0.37
## price                             0.46      0.32     0.40  0.37  1.00
## number_of_reviews                -0.07     -0.10    -0.12 -0.07 -0.17
## review_scores_rating              0.03      0.02     0.07  0.03 -0.09
## reviews_per_month                -0.02     -0.14    -0.15 -0.05 -0.21
## weighted_score                    0.04     -0.03    -0.01  0.02 -0.13
##                           number_of_reviews review_scores_rating
## host_response_rate                     0.07                 0.09
## host_total_listings_count             -0.19                -0.23
## accommodates                          -0.07                 0.03
## bathrooms                             -0.10                 0.02
## bedrooms                              -0.12                 0.07
## beds                                  -0.07                 0.03
## price                                 -0.17                -0.09
## number_of_reviews                      1.00                 0.14
## review_scores_rating                   0.14                 1.00
## reviews_per_month                      0.57                 0.15
## weighted_score                         0.21                 0.15
##                           reviews_per_month weighted_score
## host_response_rate                     0.11           0.08
## host_total_listings_count             -0.23          -0.23
## accommodates                          -0.02           0.04
## bathrooms                             -0.14          -0.03
## bedrooms                              -0.15          -0.01
## beds                                  -0.05           0.02
## price                                 -0.21          -0.13
## number_of_reviews                      0.57           0.21
## review_scores_rating                   0.15           0.15
## reviews_per_month                      1.00           0.44
## weighted_score                         0.44           1.00

Comments

  • host_response_rate has some correlation with reviews_per_month, review_scores_rating, and weighted_score.
  • host_total_listings_count is positive correlated with price, and negative correlated with weighted_score.
  • accommodates has positive correlation with price.
  • number of reviews has some negative correlations with price.

    • infer that Airbnb with lower price is more popular.
  • weighted_score has some negative correlations with price.

    • infer that with the price become higher, people may be more strict to the experience in Airbnb.

Questions

  • Why price and host_total_listings_count are apparently positive correlated? Is this because most of the listings are located in some specific neighbourhoods?
  • Why weighted_score is negative correlated with price?
  • Bedrooms and accommodates are each slightly negatively correlated with number_of_reviews (-0.12 and -0.07); does that mean people prefer Airbnbs with fewer bedrooms and a lower guest capacity?
  • Is there any other factors affect price and weighted_score?

6.2 Categorical Only

  • For categorical data, I’ll take a look at the relationships across different categorical variables:

    • host_response_time
    • host_is_superhost
    • neighbourhood_group
    • property_type
    • room_type
    • instant_bookable
    • cancellation_policy
    • performance

6.2.1 host_response_time and host_is_superhost

# Cross-tab of response time by superhost status, shown as column-wise
# proportions (each column sums to 1) with row and column totals
sea_airbnb %>%
  tabyl(host_response_time, host_is_superhost) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages("col") %>%
  adorn_rounding(digits = 2)
##  host_response_time   No  Yes Total
##  a few days or more 0.01 0.00  0.00
##        within a day 0.06 0.02  0.04
##  within a few hours 0.11 0.11  0.11
##      within an hour 0.83 0.86  0.85
##               Total 1.00 1.00  1.00

Comments

  • Superhost and non-superhost have the similar distributions of percentages on host response time.

  • Indicates we can add one more quantitative variables (i.e: price) to take a further look

    • price may differ within the same area between superhosts and non-superhosts

6.2.2 host_is_superhost and neighbourhood_group

# Cross-tab of superhost status by neighbourhood group, shown as shares of all
# listings with row and column totals
sea_airbnb %>%
  tabyl(host_is_superhost, neighbourhood_group) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages("all") %>%
  adorn_rounding(digits = 2)
##  host_is_superhost Ballard Beacon Hill Capitol Hill Cascade Central Area
##                 No    0.02        0.02         0.05    0.02         0.03
##                Yes    0.03        0.02         0.05    0.02         0.06
##              Total    0.05        0.04         0.10    0.04         0.09
##  Delridge Downtown Interbay Lake City Magnolia Northgate
##      0.01     0.12        0      0.01     0.01      0.02
##      0.02     0.05        0      0.01     0.01      0.01
##      0.03     0.17        0      0.02     0.02      0.03
##  Other neighborhoods Queen Anne Rainier Valley Seward Park
##                 0.08       0.03           0.02        0.00
##                 0.11       0.04           0.03        0.01
##                 0.19       0.07           0.05        0.01
##  University District West Seattle Total
##                 0.02         0.02  0.48
##                 0.01         0.03  0.52
##                 0.03         0.06  1.00

Comments

  • Most of the superhosts are located in Capitol Hill, Central Area, Downtown Seattle, and Queen Anne

6.2.3 host_is_superhost and instant_bookable

# Cross-tab of instant-bookable flag by superhost status, shown as shares of
# all listings with row and column totals
sea_airbnb %>%
  tabyl(instant_bookable, host_is_superhost) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages("all") %>%
  adorn_rounding(digits = 2)
##  instant_bookable   No  Yes Total
##                No 0.19 0.27  0.46
##               Yes 0.29 0.26  0.54
##             Total 0.48 0.52  1.00

Comments

  • For superhost, there is no apparent proportional difference between instant bookable and non-instant bookable.

6.2.4 host_is_superhost and cancellation_policy

# Cross-tab of superhost status by cancellation policy, shown as shares of all
# listings (three decimals) with row and column totals
sea_airbnb %>%
  tabyl(host_is_superhost, cancellation_policy) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages("all") %>%
  adorn_rounding(digits = 3)
##  host_is_superhost flexible moderate strict strict_14_with_grace_period
##                 No    0.072    0.147  0.037                       0.210
##                Yes    0.102    0.246  0.005                       0.171
##              Total    0.173    0.393  0.042                       0.381
##  super_strict_30 super_strict_60 Total
##            0.008           0.002 0.475
##            0.000           0.000 0.525
##            0.008           0.002 1.000

Comments

  • Most of the superhost do not have very strict cancellation policy.

    • Cancellation policy may affect the review scores.
    • Infer that superhost may have a higher review score.
  • Need to combine quantitative variables to take a look

6.2.5 neighbourhood_group and room_type

# Cross-tab of neighbourhood group by room type, shown as shares of all
# listings (three decimals) with row and column totals
sea_airbnb %>%
  tabyl(neighbourhood_group, room_type) %>%
  adorn_totals(c("row", "col")) %>%
  adorn_percentages("all") %>%
  adorn_rounding(digits = 3)
##  neighbourhood_group Entire home/apt Hotel room Private room Shared room
##              Ballard           0.044      0.000        0.009       0.000
##          Beacon Hill           0.025      0.000        0.013       0.000
##         Capitol Hill           0.077      0.001        0.019       0.004
##              Cascade           0.023      0.011        0.003       0.000
##         Central Area           0.069      0.000        0.022       0.000
##             Delridge           0.019      0.000        0.013       0.000
##             Downtown           0.162      0.006        0.004       0.001
##             Interbay           0.002      0.000        0.000       0.000
##            Lake City           0.011      0.000        0.007       0.000
##             Magnolia           0.014      0.000        0.004       0.000
##            Northgate           0.015      0.000        0.011       0.000
##  Other neighborhoods           0.135      0.000        0.051       0.004
##           Queen Anne           0.059      0.000        0.012       0.000
##       Rainier Valley           0.037      0.000        0.017       0.000
##          Seward Park           0.009      0.000        0.003       0.000
##  University District           0.018      0.000        0.005       0.003
##         West Seattle           0.049      0.000        0.009       0.000
##                Total           0.767      0.018        0.203       0.013
##  Total
##  0.053
##  0.037
##  0.101
##  0.037
##  0.091
##  0.033
##  0.173
##  0.002
##  0.018
##  0.018
##  0.025
##  0.190
##  0.071
##  0.055
##  0.012
##  0.027
##  0.057
##  1.000

6.2.6 neighbourhood_group and instant_bookable

# Joint proportions of neighbourhood_group (rows) by instant_bookable
# (columns), with totals, expressed as shares of all listings (3 decimals).
tabyl(sea_airbnb, neighbourhood_group, instant_bookable) %>%
  adorn_totals(where = c("row", "col")) %>%
  adorn_percentages(denominator = "all") %>%
  adorn_rounding(digits = 3)
##  neighbourhood_group    No   Yes Total
##              Ballard 0.027 0.026 0.053
##          Beacon Hill 0.017 0.020 0.037
##         Capitol Hill 0.046 0.055 0.101
##              Cascade 0.011 0.026 0.037
##         Central Area 0.043 0.049 0.091
##             Delridge 0.017 0.015 0.033
##             Downtown 0.043 0.130 0.173
##             Interbay 0.001 0.001 0.002
##            Lake City 0.011 0.008 0.018
##             Magnolia 0.010 0.008 0.018
##            Northgate 0.013 0.012 0.025
##  Other neighborhoods 0.106 0.084 0.190
##           Queen Anne 0.032 0.039 0.071
##       Rainier Valley 0.028 0.026 0.055
##          Seward Park 0.007 0.004 0.012
##  University District 0.015 0.011 0.027
##         West Seattle 0.030 0.028 0.057
##                Total 0.456 0.544 1.000

Comments

  • Beacon Hill, Capitol Hill, Cascade, Central Area, Downtown, and Queen Anne have more instant bookable Airbnb, especially for Downtown Seattle.

Questions

  • Does instant bookable Airbnb have a higher booking price?

6.2.7 instant_bookable and cancellation_policy

# Joint proportions of cancellation_policy (rows) by instant_bookable
# (columns), with totals, expressed as shares of all listings (3 decimals).
tabyl(sea_airbnb, cancellation_policy, instant_bookable) %>%
  adorn_totals(where = c("row", "col")) %>%
  adorn_percentages(denominator = "all") %>%
  adorn_rounding(digits = 3)
##          cancellation_policy    No   Yes Total
##                     flexible 0.070 0.103 0.173
##                     moderate 0.197 0.196 0.393
##                       strict 0.005 0.037 0.042
##  strict_14_with_grace_period 0.184 0.197 0.381
##              super_strict_30 0.000 0.008 0.008
##              super_strict_60 0.000 0.002 0.002
##                        Total 0.456 0.544 1.000

6.2.8 host_is_superhost and performance

# Raw listing counts: performance tier (rows) by host_is_superhost (columns),
# with a Total row and a Total column.
tabyl(sea_airbnb, performance, host_is_superhost) %>%
  adorn_totals(where = c("row", "col"))
##  performance   No  Yes Total
##          Bad  885  760  1645
##         Poor  791  651  1442
##         Fair  962  907  1869
##         Good  287  653   940
##    Excellent   69  336   405
##        Total 2994 3307  6301

Comments

  • There is no apparent difference for bad, poor and fair group across by host_is_superhost.

  • But superhost performs much better in good and excellent group

Questions

  • What is the price distribution of host_is_superhost across performance?

6.2.9 instant_bookable and performance

# Raw listing counts: instant_bookable (rows) by performance tier (columns),
# with a Total row and a Total column.
tabyl(sea_airbnb, instant_bookable, performance) %>%
  adorn_totals(where = c("row", "col"))
##  instant_bookable  Bad Poor Fair Good Excellent Total
##                No  852  709  824  349       142  2876
##               Yes  793  733 1045  591       263  3425
##             Total 1645 1442 1869  940       405  6301

Comments

  • Instant bookable Airbnbs have a better overall performance.

Questions

  • What is the price distribution of instant_bookable across performance?

7 Base EDA Step 4: Multi-variate graphical

7.1 Step 4.1: Categorical

  • Bar graphs with multiple categorical variables
# Listing counts per host_response_time, side-by-side bars per superhost status
ggplot(sea_airbnb, aes(x = host_response_time, fill = host_is_superhost)) +
  geom_bar(position = "dodge")

# Listing counts per instant_bookable, side-by-side bars per superhost status
ggplot(sea_airbnb, aes(x = instant_bookable, fill = host_is_superhost)) +
  geom_bar(position = "dodge")

Comments

  • host_response_time and instant_bookable are closely balanced across host_is_superhost.
# Listing counts per cancellation_policy, side-by-side bars per superhost
# status; x labels are tilted so the long policy names do not collide.
ggplot(sea_airbnb, aes(x = cancellation_policy, fill = host_is_superhost)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(size = 10, angle = 15))

Comments

  • There appears to be a difference in host_is_superhost across cancellation_policy

Questions

  • Will cancellation policies affect the price?

  • Will cancellation policies affect the weighted_score?

# Listing counts per neighbourhood_group, split by room_type
ggplot(sea_airbnb, aes(x = neighbourhood_group, fill = room_type)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(size = 5, angle = 15))

# Listing counts per neighbourhood_group, split by instant_bookable
ggplot(sea_airbnb, aes(x = neighbourhood_group, fill = instant_bookable)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(size = 5, angle = 15))

# Listing counts per neighbourhood_group, split by cancellation_policy
ggplot(sea_airbnb, aes(x = neighbourhood_group, fill = cancellation_policy)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(size = 5, angle = 15))

Comments

  • There appears to be a difference in neighbourhood_group across room_type, instant_bookable, and cancellation_policy.

Questions

  • Is there any price differences between instant bookable Airbnb and non-instant bookable Airbnb?
# Listing counts per cancellation_policy, split by instant_bookable
ggplot(sea_airbnb, aes(x = cancellation_policy, fill = instant_bookable)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(size = 10, angle = 15))

Comments

  • Instant_bookable is closely balanced by cancellation_policy.
# Listing counts per performance tier, split by superhost status
ggplot(sea_airbnb, aes(x = performance, fill = host_is_superhost)) +
  geom_bar(position = "dodge")

Comments

  • Confirms that superhost has a better performance than non-superhost
# Listing counts per property_type, split by performance tier
ggplot(sea_airbnb, aes(x = property_type, fill = performance)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(size = 5, angle = 15))

Comments

  • appears to be a difference in performance across property_type

  • need to take a closer look

Questions

  • Which property type has a better performance?
# Listing counts per performance tier, split by instant_bookable
ggplot(sea_airbnb, aes(x = performance, fill = instant_bookable)) +
  geom_bar(position = "dodge")

Comments

  • Instant_bookable Airbnbs have better performance in fair, good, and excellent groups.
# Listing counts per cancellation_policy, split by performance tier
ggplot(sea_airbnb, aes(x = cancellation_policy, fill = performance)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(size = 10, angle = 15))

Comments

  • Relatively flexible or moderate cancellation policies have more possibility to get a better performance.

    • take a further look

7.1.1 Categorical - performance/cancellation policy

# Performance mix within each property type, stacked in one column:
# top panel shows raw counts (dodged bars), bottom panel shows within-type
# shares (position = "fill"). Axes are flipped so the type labels sit on y.
grid.arrange(
  ggplot(sea_airbnb, aes(x = property_type, fill = performance)) +
    geom_bar(position = "dodge") +
    coord_flip() +
    theme(axis.text.y = element_text(size = 5)),
  ggplot(sea_airbnb, aes(x = property_type, fill = performance)) +
    geom_bar(position = "fill") +
    coord_flip() +
    theme(axis.text.y = element_text(size = 5)),
  ncol = 1
)

Comments

  • Farm stay and Boutique hotel have the best performance overall.

  • Townhouse, Tinyhouse, Loft, House, Guesthouse, Guest suite, Cottage, Condominium, Boat, and Apartment have the similar performance distributions.

# Performance mix within each cancellation policy, stacked in one column:
# top panel shows raw counts (dodged bars), bottom panel shows within-policy
# shares (position = "fill"). Axes are flipped in both panels.
grid.arrange(
  ggplot(sea_airbnb, aes(x = cancellation_policy, fill = performance)) +
    geom_bar(position = "dodge") +
    coord_flip(),
  ggplot(sea_airbnb, aes(x = cancellation_policy, fill = performance)) +
    geom_bar(position = "fill") +
    coord_flip(),
  ncol = 1
)

Comments

  • Confirms that flexible, moderate and strict_14_with_grace_period have similar distributions.

7.2 Step 4.2: Quantitative

In this part, I will examine the relationship between different quantitative variables:

- host_response_rate
- accommodates
- bedrooms
- reviews_per_month
- number_of_reviews
- price
- weighted_score
# Correlation of price with the other variables, plotted via inspectdf
show_plot(inspect_cor(sea_airbnb, with_col = "price"))

Comments

  • Price has apparent positive correlation with host_total_listings_count, accommodates, bedrooms, beds, and bathrooms.

  • Price has apparent negative correlation with reviews_per_month, number_of_reviews, weighted_score, and review_scores_rating.

Questions

  • Price is negatively related to number_of_reviews and reviews_per_month. Does that imply that low-priced Airbnbs are more popular?
# Correlation of weighted_score with the other variables, plotted via inspectdf
show_plot(inspect_cor(sea_airbnb, with_col = "weighted_score"))

Comments

  • weighted_score has apparent positive correlation with host_since, reviews_per_month, number_of_reviews, and review_scores_rating.

  • weighted_score has apparent negative correlation with host_total_listings_count, price, and minimum_nights.

# Scatter: weighted_score (y) against host_response_rate (x)
ggplot(sea_airbnb, aes(x = host_response_rate, y = weighted_score)) +
  geom_point()

Comments

  • Some high response rates have low weighted scores.

    • this might be because of a small number of reviews or low review scores
  • Didn’t find apparent linear relationship between weighted_score and host_response_rate

# Scatter: price (y) against host_response_rate (x)
ggplot(sea_airbnb, aes(x = host_response_rate, y = price)) +
  geom_point()

Comments

  • Most of the points are clustered on the price lower than $250 and response rate is higher than 0.875.

  • People may prefer a lower price Airbnb.

# Price against accommodates: a grey violin outline with the individual
# listings jittered on top and coloured by accommodates.
ggplot(sea_airbnb, aes(x = accommodates, y = price)) +
  geom_violin(alpha = 0.5, color = "gray") +
  geom_jitter(
    aes(color = accommodates),
    alpha = 0.5,
    position = position_jitter(width = 0.1)
  )

Comments

  • Most of the data points are clustered below $500 and less than 10 accommodates.
# Price against bedrooms: a grey violin outline with the individual
# listings jittered on top and coloured by bedrooms.
ggplot(sea_airbnb, aes(x = bedrooms, y = price)) +
  geom_violin(alpha = 0.5, color = "gray") +
  geom_jitter(
    aes(color = bedrooms),
    alpha = 0.5,
    position = position_jitter(width = 0.1)
  )

Comments

  • For each numbers of bedrooms, there are different prices from low to high.
  • In general, the more bedrooms, the price tends to be higher.
# Scatter: price (y) against host_total_listings_count (x)
ggplot(sea_airbnb, aes(x = host_total_listings_count, y = price)) +
  geom_point()

Comments

  • Most of the data points are clustered below $500 and 250 total listing counts.

  • Didn’t find apparent positive distributions between total listings and price.

# Scatter: price (y) against reviews_per_month (x)
ggplot(sea_airbnb, aes(x = reviews_per_month, y = price)) +
  geom_point()

Comments

  • The highest numbers of reviews per month appear at prices below $250.

    • Confirms that people prefer a relatively low price
# Scatter: price (y) against weighted_score (x)
ggplot(sea_airbnb, aes(x = weighted_score, y = price)) +
  geom_point()

Comments

  • Nearly all high weighted score are clustered on price below $250.

  • Only a few high price Airbnbs have high weighted score.

    • This might be because the higher the price, the higher the expectations people have.
# Scatter: weighted_score (y) against number_of_reviews (x)
ggplot(sea_airbnb, aes(x = number_of_reviews, y = weighted_score)) +
  geom_point()

Comments

  • Confirms that more number of reviews will have more chance to get a higher weighted_score
# accommodates (y) against reviews_per_month (x): grey violin outline plus
# jittered listings coloured by accommodates.
# NOTE(review): the continuous reviews_per_month is on x here, whereas the
# price plots put the count variable on x - confirm the orientation is intended.
ggplot(sea_airbnb, aes(x = reviews_per_month, y = accommodates)) +
  geom_violin(alpha = 0.5, color = "gray") +
  geom_jitter(
    aes(color = accommodates),
    alpha = 0.5,
    position = position_jitter(width = 0.1)
  )

Comments

  • Confirms that listings with fewer accommodates (i.e., fewer than 5) are more popular.
# bedrooms (y) against reviews_per_month (x): grey violin outline plus
# jittered listings coloured by bedrooms.
# NOTE(review): the continuous reviews_per_month is on x here, whereas the
# price plots put the count variable on x - confirm the orientation is intended.
ggplot(sea_airbnb, aes(x = reviews_per_month, y = bedrooms)) +
  geom_violin(alpha = 0.5, color = "gray") +
  geom_jitter(
    aes(color = bedrooms),
    alpha = 0.5,
    position = position_jitter(width = 0.1)
  )

Comments

  • Confirms that less bedrooms (i.e.: less than 3) are more popular.

7.3 Step 4.3: Categorical and quantitative

# Pairwise plot matrix over eight columns picked by position.
# NOTE(review): positional indices depend on the current column order of
# sea_airbnb - confirm they still point at the intended variables.
ggpairs(sea_airbnb, columns = c(5, 7, 15, 16, 20, 24, 25, 27))

Comments

  • host_is_superhost balanced looking both directions on correlation graph.

  • instant_bookable balanced looking both directions on correlation graph.

Questions

  • Look more closely at:

    • host_is_superhost/price
    • neighbourhood_group/price
    • property_type/price
    • performance/price
    • cancellation_policy/price
    • instant_bookable/price
    • neighbourhood_group/reviews_per_month
    • property_type/reviews_per_month
    • instant_bookable/reviews_per_month
    • cancellation_policy/reviews_per_month
    • host_is_superhost/reviews_per_month
    • host_is_superhost/weighted_score
    • host_response_rate/performance

7.3.1 host_is_superhost/price and neighbourhood_group/price

# Price distributions, stacked in one column:
# by superhost status (top) and by neighbourhood group (bottom).
grid.arrange(
  ggplot(sea_airbnb, aes(x = host_is_superhost, y = price)) +
    geom_boxplot(),
  ggplot(sea_airbnb, aes(x = neighbourhood_group, y = price)) +
    geom_boxplot() +
    theme(axis.text.x = element_text(size = 7, angle = 15)),
  ncol = 1
)

Comments

  • host_is_superhost vs price

    • superhost price distribution is little lower than non-superhost.
  • neighbourhood_group vs price

    • Downtown and Queen Anne have higher price distributions.
    • Beacon Hill, Delridge, Northgate and University District have relatively low price distributions.

7.3.2 property_type/price and performance/price

# Price distributions, stacked in one column:
# by property type (top) and by performance tier (bottom).
grid.arrange(
  ggplot(sea_airbnb, aes(x = property_type, y = price)) +
    geom_boxplot() +
    theme(axis.text.x = element_text(size = 5, angle = 15)),
  ggplot(sea_airbnb, aes(x = performance, y = price)) +
    geom_boxplot(),
  ncol = 1
)

Comments

  • property_type vs price

    • Boutique hotel has the highest price distribution.
    • Farm stay and Tree house have lowest price distribution.
  • performance vs price

    • Good and Excellent performance have relatively low price distributions.
    • Bad and Poor performance have relatively high price distributions.
      • Further confirms that people prefer relatively low-priced Airbnbs.

7.3.3 cancellation_policy/price and instant_bookable/price

# Price distributions, stacked in one column:
# by cancellation policy (top) and by instant_bookable (bottom).
grid.arrange(
  ggplot(sea_airbnb, aes(x = cancellation_policy, y = price)) +
    geom_boxplot() +
    theme(axis.text.x = element_text(size = 9, angle = 10)),
  ggplot(sea_airbnb, aes(x = instant_bookable, y = price)) +
    geom_boxplot(),
  ncol = 1
)

Comments

  • cancellation_policy vs price

    • super_strict_60 has the highest price distribution.
    • flexible and moderate have the relatively low price distribution.
  • instant_bookable vs price

    • The price distribution of instant bookable Airbnbs is a little higher than that of non-instant bookable Airbnbs.

Questions

  • Which cancellation policy is more welcomed?

    • look at the cancellation_policy across reviews_per_month.

7.3.4 cancellation_policy/reviews_per_month and property_type/reviews_per_month

# reviews_per_month distributions, stacked in one column:
# by cancellation policy (top) and by property type (bottom).
grid.arrange(
  ggplot(sea_airbnb, aes(x = cancellation_policy, y = reviews_per_month)) +
    geom_boxplot() +
    theme(axis.text.x = element_text(size = 9, angle = 10)),
  ggplot(sea_airbnb, aes(x = property_type, y = reviews_per_month)) +
    geom_boxplot() +
    theme(axis.text.x = element_text(size = 5, angle = 15)),
  ncol = 1
)

Comments

  • cancellation_policy vs reviews_per_month

    • Confirms that flexible and moderate cancellation policies are more welcomed.
  • property_type vs reviews_per_month

    • farm stay is popular among property_types.
    • Bed and breakfast and Tent are less popular.

Questions

  • How about the revenue across different cancellation policies?

  • How about the revenue across different property types?

7.3.5 instant_bookable/reviews_per_month and neighbourhood_group/reviews_per_month

# reviews_per_month distributions, stacked in one column:
# by instant_bookable (top) and by neighbourhood group (bottom).
grid.arrange(
  ggplot(sea_airbnb, aes(x = instant_bookable, y = reviews_per_month)) +
    geom_boxplot(),
  ggplot(sea_airbnb, aes(x = neighbourhood_group, y = reviews_per_month)) +
    geom_boxplot() +
    theme(axis.text.x = element_text(size = 7, angle = 15)),
  ncol = 1
)

Comments

  • instant_bookable vs reviews_per_month

    • instant bookable Airbnb is more welcomed by people.
  • neighbourhood_group vs reviews_per_month

    • Cascade and University District have lowest review number distributions.
    • Beacon Hill, Capitol Hill, Central Area, Delridge, and Queen Anne have relatively higher monthly review number distributions.

7.3.6 host_is_superhost/reviews_per_month and host_is_superhost/weighted_score

# Superhost vs non-superhost, stacked in one column:
# reviews_per_month (top) and weighted_score (bottom).
grid.arrange(
  ggplot(sea_airbnb, aes(x = host_is_superhost, y = reviews_per_month)) +
    geom_boxplot(),
  ggplot(sea_airbnb, aes(x = host_is_superhost, y = weighted_score)) +
    geom_boxplot(),
  ncol = 1
)

Comments

  • host_is_superhost vs reviews_per_month

    • Airbnb of superhost is more popular.
  • host_is_superhost vs weighted_score

    • superhost’s weighted score is higher than non-superhost.

7.3.7 host_response_rate across performance

# host_response_rate distribution within each performance tier
ggplot(sea_airbnb, aes(x = performance, y = host_response_rate)) +
  geom_boxplot()

Comments

  • host_response_rate vs performance

    • Response rate of most Good and Excellent performance hosts are high.

7.3.8 annual_revenue/cancellation_policy and annual_revenue/neighbourhood_group

# Derive an annual revenue estimate per listing.
# Simplifying assumption: reviews_per_month stands in for the number of booked
# days per month, so price * reviews_per_month * 12 approximates yearly revenue.
sea_airbnb <- sea_airbnb %>%
  mutate(annual_rev = price * reviews_per_month * 12)

# annual_rev distributions, stacked in one column:
# by cancellation policy (top) and by neighbourhood group (bottom).
grid.arrange(
  ggplot(sea_airbnb, aes(x = cancellation_policy, y = annual_rev)) +
    geom_boxplot() +
    theme(axis.text.x = element_text(size = 9, angle = 10)),
  ggplot(sea_airbnb, aes(x = neighbourhood_group, y = annual_rev)) +
    geom_boxplot() +
    theme(axis.text.x = element_text(size = 7, angle = 12)),
  ncol = 1
)

Comments

  • cancellation_policy vs annual_revenue

    • flexible, moderate and strict_14_with_grace_period have relatively higher annual revenue distributions.
  • neighbourhood_group vs annual_revenue

    • Capitol Hill, Downtown, and Queen Anne have relatively higher annual revenue distributions.

7.3.9 annual_revenue/host_is_superhost and annual_revenue/instant_bookable

# annual_rev distributions, stacked in one column:
# by superhost status (top) and by instant_bookable (bottom).
grid.arrange(
  ggplot(sea_airbnb, aes(x = host_is_superhost, y = annual_rev)) +
    geom_boxplot(),
  ggplot(sea_airbnb, aes(x = instant_bookable, y = annual_rev)) +
    geom_boxplot(),
  ncol = 1
)

Comments

  • host_is_superhost vs annual_revenue

    • superhost has higher annual revenue distributions.
  • instant_bookable vs annual_revenue

    • instant_bookable has higher annual revenue distributions.

7.3.10 annual_revenue/property_type

# annual_rev distribution within each property type
ggplot(sea_airbnb, aes(x = property_type, y = annual_rev)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(size = 5, angle = 13))

Comments

  • Apartment and Condominium have similar annual revenue distributions.

  • House has more chance to earn a high revenue.

  • Even though Farm stay is popular, its annual revenue is not so high.

  • Tree house has the lowest annual revenue.

8 Detailed EDA - questions raised in Base EDA and other new questions

8.1 What is the annual_revenue distribution of host_is_superhost across performance?

# Median annual_rev per performance tier, one line per superhost status.
sea_airbnb %>%
  group_by(host_is_superhost, performance) %>%
  summarise(med_revenue = median(annual_rev)) %>%
  ggplot(mapping = aes(
    x = performance, y = med_revenue,
    color = host_is_superhost, group = host_is_superhost
  )) +
  # colour is inherited from the plot-level mapping
  geom_line() +
  geom_point() +
  labs(x = "Performance", y = "Median Revenue", color = "Host is superhost") +
  theme_classic() +
  #scale_color_manual(values = c("#969696", "red"), labels = c("No", "Yes"), guide = guide_legend(reverse = TRUE)) +
  # `labels` spelled out in full instead of relying on partial matching
  scale_y_continuous(labels = dollar)

Comments

  • For each performance level, superhost can earn more revenue than non-superhost.

    • would recommend Airbnb host to be the superhost.

8.2 What is the distribution of annual_revenue and neighbourhood_group across instant bookable?

# Median annual_rev per neighbourhood group, one line per instant_bookable
# value; axes are flipped so the neighbourhood names sit on the vertical axis.
sea_airbnb %>%
  group_by(neighbourhood_group, instant_bookable) %>%
  summarise(med_revenue = median(annual_rev)) %>%
  ggplot(mapping = aes(
    x = neighbourhood_group, y = med_revenue,
    color = instant_bookable, group = instant_bookable
  )) +
  # colour is inherited from the plot-level mapping
  geom_line() +
  geom_point() +
  coord_flip() +
  labs(x = "Neighbourhood Group", y = "Median Revenue", color = "Instant Bookable") +
  theme_classic() +
  #theme(axis.text.x = element_text(size = 5, angle = 13)) +
  #scale_color_manual(values = c("#969696", "red"), labels = c("No", "Yes"), guide = guide_legend(reverse = TRUE)) +
  # `labels` spelled out in full instead of relying on partial matching
  scale_y_continuous(labels = dollar)

Comments

  • Airbnb in Capitol Hill, Central Area, Magnolia, and Queen Anne can get the highest median revenue.
  • In each neighbourhood group, instant bookable Airbnbs can get higher median revenue.

8.3 What is the annual_revenue distribution of cancellation_policy across host_is_superhost?

# Median annual_rev per cancellation policy, one line per superhost status;
# axes are flipped so the policy names sit on the vertical axis.
sea_airbnb %>%
  group_by(cancellation_policy, host_is_superhost) %>%
  summarise(med_revenue = median(annual_rev)) %>%
  ggplot(mapping = aes(
    x = cancellation_policy, y = med_revenue,
    color = host_is_superhost, group = host_is_superhost
  )) +
  # colour is inherited from the plot-level mapping
  geom_line() +
  geom_point() +
  coord_flip() +
  labs(x = "Cancellation Policy", y = "Median Revenue", color = "Host is superhost") +
  theme_classic() +
  #scale_color_manual(values = c("#969696", "red"), labels = c("No", "Yes"), guide = guide_legend(reverse = TRUE)) +
  # `labels` spelled out in full instead of relying on partial matching
  scale_y_continuous(labels = dollar)

Comments

  • For most of the cancellation policy, superhost can earn more median revenue than non-super host.

8.4 What is the distribution of property_type and median_price across host_is_superhost?

# Median price per property type, one line per superhost status;
# axes are flipped so the property type names sit on the vertical axis.
sea_airbnb %>%
  group_by(property_type, host_is_superhost) %>%
  summarise(med_price = median(price)) %>%
  ggplot(mapping = aes(
    x = property_type, y = med_price,
    color = host_is_superhost, group = host_is_superhost
  )) +
  # colour is inherited from the plot-level mapping
  geom_line() +
  geom_point() +
  coord_flip() +
  labs(x = "Property type", y = "Median Price", color = "Host is superhost") +
  theme_classic() +
  #scale_color_manual(values = c("#969696", "red"), labels = c("No", "Yes"), guide = guide_legend(reverse = TRUE)) +
  # `labels` spelled out in full instead of relying on partial matching
  scale_y_continuous(labels = dollar)

Comments

  • For both superhost and non-superhost, Boutique hotel has the highest median price.
# Median annual_rev per neighbourhood group, one line per superhost status;
# axes are flipped so the neighbourhood names sit on the vertical axis.
sea_airbnb %>%
  group_by(neighbourhood_group, host_is_superhost) %>%
  summarise(med_revenue = median(annual_rev)) %>%
  ggplot(mapping = aes(
    x = neighbourhood_group, y = med_revenue,
    color = host_is_superhost, group = host_is_superhost
  )) +
  # colour is inherited from the plot-level mapping
  geom_line() +
  geom_point() +
  coord_flip() +
  labs(x = "Neighbourhood Group", y = "Median Revenue", color = "Host is superhost") +
  theme_classic() +
  #scale_color_manual(values = c("#969696", "red"), labels = c("No", "Yes"), guide = guide_legend(reverse = TRUE)) +
  # `labels` spelled out in full instead of relying on partial matching
  scale_y_continuous(labels = dollar)

Comments

  • No matter hosts in which area, being a superhost can earn more median revenue per year.
  • For superhosts, they can earn the highest revenue in Queen Anne neighbourhood.
  • For nonsuperhosts, they earn maximum median revenue in Central Area neighbourhood.

8.5 What is the distribution of median_price and neighbourhood_group across property_type?

# Pick the three property types with the most listings
top_3_property_type <- sea_airbnb %>%
  group_by(property_type) %>%
  summarise(count_n = n()) %>%
  arrange(desc(count_n)) %>%
  slice(1:3)

# Keep only listings of those three types (the join also attaches count_n)
sea_airbnb_prpt_t3 <- inner_join(sea_airbnb, top_3_property_type, by = "property_type")

# Median price per neighbourhood group, one line per property type;
# axes are flipped so the neighbourhood names sit on the vertical axis.
sea_airbnb_prpt_t3 %>%
  group_by(neighbourhood_group, property_type) %>%
  summarise(med_price = median(price)) %>%
  ggplot(mapping = aes(
    x = neighbourhood_group, y = med_price,
    color = property_type, group = property_type
  )) +
  # colour is inherited from the plot-level mapping
  geom_line() +
  geom_point() +
  coord_flip() +
  labs(x = "Neighbourhood Group", y = "Median Price", color = "Property Type") +
  theme_classic() +
  # `labels` spelled out in full instead of relying on partial matching
  scale_y_continuous(labels = dollar)

Comments

  • Showing all property types will be overwhelming, so I only picked the top 3 popular property type.
  • Apartment has the highest median price in Downtown.
  • Guest suite has the highest median price in Interbay.
  • House has the highest median price in Queen Anne.
  • The median price of all property types are lower than $200.

8.6 What is the distribution of median_price and reviews_per_month across instant_bookable

# Median price at each distinct reviews_per_month value, computed separately
# for instant bookable and non-instant bookable listings.
med_price_by_reviews <- summarise(
  group_by(sea_airbnb, instant_bookable, reviews_per_month),
  med_price = median(price)
)

# One facet per instant_bookable value; grid lines and panel background removed.
ggplot(
  med_price_by_reviews,
  aes(x = reviews_per_month, y = med_price, color = instant_bookable)
) +
  geom_point() +
  facet_grid(. ~ instant_bookable) +
  labs(x = "Reviews Per Month", y = "Median Price", color = "Instant Bookable") +
  scale_y_continuous(labels = dollar) +
  theme_bw() +
  theme(
    axis.text.x = element_text(face = "bold", size = 10, angle = 0),
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5, face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank()
  )

Comments

  • Instant_bookable Airbnb has more high number of monthly reviews.

    • It seems that instant bookable Airbnb is more popular.
  • Instant_bookable Airbnb has more high price data point.

8.7 What is the distribution of performance and median price across instant_bookable?

# Median price per performance tier, one line per instant_bookable value.
sea_airbnb %>%
  group_by(performance, instant_bookable) %>%
  summarise(med_price = median(price)) %>%
  ggplot(mapping = aes(
    x = performance, y = med_price,
    color = instant_bookable, group = instant_bookable
  )) +
  # colour is inherited from the plot-level mapping
  geom_line() +
  geom_point() +
  labs(x = "Performance", y = "Median Price", color = "Instant Bookable") +
  theme_classic() +
  # `labels` spelled out in full instead of relying on partial matching
  scale_y_continuous(labels = dollar)

Comments

  • The median prices of Fair, Good and Excellent performance are all lower than $140.

    • implies that people may feel more satisfied with a relatively lower-priced Airbnb.
  • The price of instant bookable across bad performance has the highest median price.

9 Statistical Analysis

Two main findings

  • Being superhost has the higher revenue than non-superhost.
  • Instant bookable Airbnb can earn more revenue than non-instant bookable Airbnb.

Establish a hypothesis

  • Finding 1: Being superhost can increase revenue.

    • Null hypothesis: Being a superhost has no effect on revenue.
    • Alternate hypothesis: Being a superhost has an effect on revenue.
  • Finding 2: Instant bookable can increase revenue.

    • Null hypothesis: Instant bookable has no effect on revenue.

      • Instant bookable Airbnb in different neighbourhoods has no effect on revenue.
    • Alternate hypothesis: Instant bookable has an effect on revenue.

      • Instant bookable Airbnb in different neighbourhoods has an effect on revenue.

Set confidence interval

  • Confidence level = 95%

9.1 Test first finding:

Superhost and Revenue

  • Null hypothesis: Being a superhost has no effect on revenue.
  • Alternate hypothesis: Being a superhost has an effect on revenue.

9.1.1 Visualize host_is_superhost and Annual_Revenue

# annual_rev distribution for superhosts vs non-superhosts
ggplot(sea_airbnb, aes(x = host_is_superhost, y = annual_rev)) +
  geom_boxplot()

According to the boxplot, host_is_superhost appears to have some effect on annual revenue.

9.1.2 t-test

# Two-sample t-test at the 95% confidence level: annual revenue of
# superhosts ("Yes") against non-superhosts ("No").
t.test(
  sea_airbnb$annual_rev[sea_airbnb$host_is_superhost == "Yes"],
  sea_airbnb$annual_rev[sea_airbnb$host_is_superhost == "No"],
  conf.level = 0.95
)
## 
##  Welch Two Sample t-test
## 
## data:  sea_airbnb$annual_rev[sea_airbnb$host_is_superhost == "Yes"] and sea_airbnb$annual_rev[sea_airbnb$host_is_superhost == "No"]
## t = 12.802, df = 6280.9, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1199.990 1633.962
## sample estimates:
## mean of x mean of y 
##  5054.503  3637.527

My null hypothesis is that being a superhost has no effect on revenue. According to the t-test, the p-value is close to zero at the 95% confidence level, which indicates that we can reject the null hypothesis that being a superhost has no effect on revenue.

9.1.3 Statistical Visual

# Median revenue by host_is_superhost with 95% error bars.
# A two-sided 95% interval leaves 2.5% in each tail, so the critical value is
# the 97.5th percentile of the standard normal (about 1.96). qnorm(0.95)
# would give the one-sided value (about 1.645).
z <- qnorm(0.975)

sea_airbnb %>%
  group_by(host_is_superhost) %>%
  summarise(
    med_revenue = median(annual_rev),
    sd = sd(annual_rev),
    n = n(),
    # NOTE(review): z * sd / sqrt(n) is the normal-approximation half-width
    # for a mean, but it is drawn around the median - confirm this is intended.
    ci = z * sd / sqrt(n)
  ) %>%
  ggplot(aes(x = host_is_superhost, y = med_revenue)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_errorbar(
    aes(ymin = med_revenue - ci, ymax = med_revenue + ci),
    width = 0.5,
    position = position_dodge(0.9)
  )

According to the error bars, the spread of the data is small relative to our confidence interval. Specifically, the lower end of the host_is_superhost “Yes” error bar does not overlap with the host_is_superhost “No” error bar, which means the data for host_is_superhost “Yes” differ from the data for host_is_superhost “No”. So we can reject our null hypothesis that being a superhost has no effect on median revenue.

9.2 Test second finding: Instant_bookable and Revenue

# annual_rev distribution per instant_bookable value, one box per
# neighbourhood group. The previous group_by() was dropped: the data go
# straight into ggplot(), which ignores dplyr groups, so it had no effect.
sea_airbnb %>%
  ggplot(mapping = aes(x = instant_bookable, y = annual_rev, fill = neighbourhood_group)) +
  geom_boxplot() +
  labs(x = "Instant bookable", y = "Annual Revenue", fill = "Neighbourhood Group")

According to the boxplot, within the same neighbourhood group, different booking policies can bring different revenues. This may imply that the combination of instant_bookable and neighbourhood_group has an effect on revenue.

9.2.1 Multinomial Analysis

# Count listings in every instant_bookable x neighbourhood_group cell.
# count() returns an ungrouped table with one row per cell.
A_F_n <- sea_airbnb %>%
  count(instant_bookable, neighbourhood_group, name = "n")

# Simultaneous 95% confidence intervals for the cell proportions.
# multinomialCI() takes the vector of counts and returns a two-column matrix
# (lower, upper) with one row per count, in the same order.
A_F_n_ci <- multinomialCI(A_F_n$n, alpha = 0.05)

# Build proportions and interval bounds from the same count table, so the
# rows cannot drift out of alignment between separate aggregations.
A_F_tab <- A_F_n %>%
  mutate(
    prop = round(n / sum(n), 3),
    ci_l = round(A_F_n_ci[, 1], 3),
    ci_u = round(A_F_n_ci[, 2], 3)
  ) %>%
  select(-n)

# Show the table
formattable(A_F_tab)
instant_bookable neighbourhood_group prop ci_l ci_u
No Ballard 0.027 0.018 0.037
No Beacon Hill 0.017 0.008 0.026
No Capitol Hill 0.046 0.037 0.056
No Cascade 0.011 0.001 0.020
No Central Area 0.043 0.033 0.052
No Delridge 0.017 0.008 0.027
No Downtown 0.043 0.034 0.052
No Interbay 0.001 0.000 0.010
No Lake City 0.011 0.002 0.020
No Magnolia 0.010 0.001 0.019
No Northgate 0.013 0.004 0.022
No Other neighborhoods 0.106 0.096 0.115
No Queen Anne 0.032 0.023 0.041
No Rainier Valley 0.028 0.019 0.038
No Seward Park 0.007 0.000 0.017
No University District 0.015 0.006 0.025
No West Seattle 0.030 0.020 0.039
Yes Ballard 0.026 0.016 0.035
Yes Beacon Hill 0.020 0.011 0.030
Yes Capitol Hill 0.055 0.046 0.065
Yes Cascade 0.026 0.017 0.035
Yes Central Area 0.049 0.039 0.058
Yes Delridge 0.015 0.006 0.025
Yes Downtown 0.130 0.121 0.140
Yes Interbay 0.001 0.000 0.011
Yes Lake City 0.008 0.000 0.017
Yes Magnolia 0.008 0.000 0.018
Yes Northgate 0.012 0.003 0.022
Yes Other neighborhoods 0.084 0.075 0.093
Yes Queen Anne 0.039 0.030 0.049
Yes Rainier Valley 0.026 0.017 0.036
Yes Seward Park 0.004 0.000 0.014
Yes University District 0.011 0.002 0.021
Yes West Seattle 0.028 0.019 0.037

9.2.2 Statistical Visual

# Dodged bars of the cell proportions with their confidence limits; each bar is
# labelled with its proportion rounded to two decimals
ggplot(
  A_F_tab,
  aes(x = instant_bookable, y = prop, fill = neighbourhood_group)
) +
  geom_col(position = "dodge") +
  geom_text(
    aes(label = round(prop, 2)),
    position = position_dodge(0.9),
    vjust = -1.5,
    size = 4,
    color = "black"
  ) +
  geom_errorbar(
    aes(ymin = ci_l, ymax = ci_u),
    position = position_dodge(0.9),
    width = 0.4
  ) +
  labs(x = "Instant Bookable", y = "Proportion", fill = "Neighbourhood group")

Based on this sample:

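  • Several proportions are reliably greater than zero (their confidence intervals exclude zero)
  • Instant-bookable listings in Downtown have the largest proportion (0.13), and its confidence interval lies above that of every other group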

9.3 Multiple linear regression

# Multiple linear regression of annual revenue on superhost status, instant
# bookability and neighbourhood group. glm() is called without a `family`
# argument, so this is a Gaussian model (the summary reports "gaussian
# family"), not a logit regression.
mod <- glm(annual_rev ~ host_is_superhost + instant_bookable + neighbourhood_group, data = sea_airbnb)
summary(mod)
## 
## Call:
## glm(formula = annual_rev ~ host_is_superhost + instant_bookable + 
##     neighbourhood_group, data = sea_airbnb)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
##  -7234   -2678    -984    1467   58562  
## 
## Coefficients:
##                                        Estimate Std. Error t value
## (Intercept)                              2795.4      248.3  11.258
## host_is_superhostYes                     1644.3      109.6  14.998
## instant_bookableYes                      1501.4      109.9  13.656
## neighbourhood_groupBeacon Hill            154.0      359.9   0.428
## neighbourhood_groupCapitol Hill           953.8      285.8   3.338
## neighbourhood_groupCascade               -411.8      362.6  -1.136
## neighbourhood_groupCentral Area           526.9      291.0   1.811
## neighbourhood_groupDelridge             -1008.7      374.4  -2.694
## neighbourhood_groupDowntown               557.7      267.9   2.081
## neighbourhood_groupInterbay              -362.3     1241.2  -0.292
## neighbourhood_groupLake City            -1841.9      455.7  -4.042
## neighbourhood_groupMagnolia              -237.1      458.7  -0.517
## neighbourhood_groupNorthgate            -1637.3      407.3  -4.019
## neighbourhood_groupOther neighborhoods   -851.3      261.8  -3.252
## neighbourhood_groupQueen Anne            1422.5      305.6   4.655
## neighbourhood_groupRainier Valley       -1045.6      324.8  -3.219
## neighbourhood_groupSeward Park          -1762.8      546.0  -3.228
## neighbourhood_groupUniversity District  -1993.5      401.5  -4.965
## neighbourhood_groupWest Seattle          -370.8      320.7  -1.156
##                                                    Pr(>|t|)    
## (Intercept)                            < 0.0000000000000002 ***
## host_is_superhostYes                   < 0.0000000000000002 ***
## instant_bookableYes                    < 0.0000000000000002 ***
## neighbourhood_groupBeacon Hill                      0.66876    
## neighbourhood_groupCapitol Hill                     0.00085 ***
## neighbourhood_groupCascade                          0.25615    
## neighbourhood_groupCentral Area                     0.07026 .  
## neighbourhood_groupDelridge                         0.00708 ** 
## neighbourhood_groupDowntown                         0.03744 *  
## neighbourhood_groupInterbay                         0.77040    
## neighbourhood_groupLake City                    0.000053683 ***
## neighbourhood_groupMagnolia                         0.60525    
## neighbourhood_groupNorthgate                    0.000059004 ***
## neighbourhood_groupOther neighborhoods              0.00115 ** 
## neighbourhood_groupQueen Anne                   0.000003314 ***
## neighbourhood_groupRainier Valley                   0.00129 ** 
## neighbourhood_groupSeward Park                      0.00125 ** 
## neighbourhood_groupUniversity District          0.000000704 ***
## neighbourhood_groupWest Seattle                     0.24765    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 17842552)
## 
##     Null deviance: 124977025806  on 6300  degrees of freedom
## Residual deviance: 112086913946  on 6282  degrees of freedom
## AIC: 123111
## 
## Number of Fisher Scoring iterations: 2
# plot residuals to check for patterns
# Residuals of mod against the response variable (annual_rev)
par(mfrow = c(1, 1))
plot(sea_airbnb$annual_rev, mod$residuals)

# Residuals against price; price is not one of the predictors in mod
par(mfrow = c(1, 1))
plot(sea_airbnb$price, mod$residuals)

# Residuals against each of the three predictors used in mod
plot(sea_airbnb$host_is_superhost, mod$residuals)

plot(sea_airbnb$instant_bookable, mod$residuals)

plot(sea_airbnb$neighbourhood_group, mod$residuals)

9.4 Visualization of Multiple Regression

# Coefficient matrix of the fitted model and the matching confidence limits;
# the first row (the intercept) is dropped from both before combining them
coe <- summary(mod)$coefficients
ci_bounds <- confint(mod)
coe_CI <- as.data.frame(cbind(coe[-1, ], ci_bounds[-1, ]))

# Short column names for the table and the plot
colnames(coe_CI) <- c("estimate", "se", "t", "pval", "low_CI", "high_CI")

# Show the terms sorted by ascending p-value, rounded to one decimal place
by_pval <- order(coe_CI$pval)
htmlTable(round(coe_CI[by_pval, ], 1))
estimate se t pval low_CI high_CI
host_is_superhostYes 1644.3 109.6 15 0 1429.4 1859.1
instant_bookableYes 1501.4 109.9 13.7 0 1285.9 1716.8
neighbourhood_groupUniversity District -1993.5 401.5 -5 0 -2780.3 -1206.6
neighbourhood_groupQueen Anne 1422.5 305.6 4.7 0 823.5 2021.5
neighbourhood_groupLake City -1841.9 455.7 -4 0 -2735.1 -948.7
neighbourhood_groupNorthgate -1637.3 407.3 -4 0 -2435.6 -838.9
neighbourhood_groupCapitol Hill 953.8 285.8 3.3 0 393.7 1513.9
neighbourhood_groupOther neighborhoods -851.3 261.8 -3.3 0 -1364.4 -338.2
neighbourhood_groupSeward Park -1762.8 546 -3.2 0 -2832.9 -692.6
neighbourhood_groupRainier Valley -1045.6 324.8 -3.2 0 -1682.2 -409
neighbourhood_groupDelridge -1008.7 374.4 -2.7 0 -1742.6 -274.9
neighbourhood_groupDowntown 557.7 267.9 2.1 0 32.5 1082.8
neighbourhood_groupCentral Area 526.9 291 1.8 0.1 -43.5 1097.3
neighbourhood_groupWest Seattle -370.8 320.7 -1.2 0.2 -999.5 257.8
neighbourhood_groupCascade -411.8 362.6 -1.1 0.3 -1122.4 298.9
neighbourhood_groupMagnolia -237.1 458.7 -0.5 0.6 -1136.2 662
neighbourhood_groupBeacon Hill 154 359.9 0.4 0.7 -551.5 859.5
neighbourhood_groupInterbay -362.3 1241.2 -0.3 0.8 -2795 2070.5
# Coefficient plot: one point per model term, ordered on the y axis by
# descending p-value. The x range is fixed to span every confidence interval.
g1 <- ggplot(
  coe_CI,
  aes(x = estimate, y = reorder(row.names(coe_CI), desc(pval)))
) +
  geom_point(size = 3) +
  xlim(min(coe_CI$low_CI), max(coe_CI$high_CI)) +
  ylab("Variable") +
  xlab("Coefficient") +
  theme_bw()

# Add the confidence interval as a single segment from low_CI to high_CI.
# All columns are mapped inside aes() (no `coe_CI$...` in the mapping, no raw
# vectors passed as xend), so the layer stays tied to the plot data.
g2 <- g1 +
  geom_segment(
    aes(
      x = low_CI,
      xend = high_CI,
      yend = reorder(row.names(coe_CI), desc(pval))
    ),
    color = "Blue"
  ) +
  xlab("Coefficient with Confidence Interval")

# Reference line at zero: intervals crossing it include a zero coefficient
g3 <- g2 +
  geom_vline(xintercept = 0, color = "red")
g3

Comments

  • The coefficients of host_is_superhostYes, instant_bookableYes, neighbourhood_groupUniversity District, neighbourhood_groupQueen Anne and neighbourhood_groupLake City are far from zero and their confidence intervals exclude zero, which indicates that they have a larger, statistically significant association with revenue.
  • The coefficients of neighbourhood_groupWest Seattle, neighbourhood_groupCascade, neighbourhood_groupMagnolia, neighbourhood_groupBeacon Hill and neighbourhood_groupInterbay have confidence intervals that include zero, which indicates that their effect on revenue (relative to the baseline neighbourhood group) is not statistically significant.

    • We can confirm that superhost status has an effect on revenue.
    • We can confirm that instant_bookable and some of the neighbourhood groups have an effect on revenue.

10 Summary

In summary, I conclude that superhost status, neighbourhood group and instant bookability all have an effect on revenue.

- Revenue is higher for superhost.
- Revenue is higher for instant bookable Airbnb.
- Revenue is higher in some specific neighbourhood groups.

11 Create professional quality visual

11.1 annual_revenue/host_is_superhost/performance

# Median annual revenue per performance level, one line for superhosts and one
# for non-superhosts, with a dashed reference line at 6050
rev_suphost_perform <- sea_airbnb %>%
  group_by(host_is_superhost, performance) %>%
  summarise(med_revenue = median(annual_rev)) %>%
  ggplot(
    aes(
      x = performance,
      y = med_revenue,
      color = host_is_superhost,
      group = host_is_superhost
    )
  ) +
  geom_line() +
  geom_point() +
  geom_hline(yintercept = 6050, linetype = 2, color = "black") +
  scale_color_manual(
    values = c("#969696", "red"),
    labels = c("No", "Yes"),
    guide = guide_legend(reverse = TRUE)
  ) +
  scale_y_continuous(labels = dollar) +
  labs(
    title = "Being Superhost Can Earn More Revenue Per Year",
    subtitle = "Median revenue of superhost is higher than non-superhost",
    x = "Performance",
    y = "Median Revenue",
    color = "Host is superhost"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(face = "bold"),
    legend.position = c(0.85, 0.17)
  )

# show the graph
rev_suphost_perform

11.2 annual_revenue/neighbourhood_group/instant bookable

# Median annual revenue per neighbourhood group, one line for instant-bookable
# listings and one for the rest; axes are flipped so the groups read top to
# bottom. Dashed lines at 6200 and 4000 bracket the annotated revenue range.
rev_neighbour_book <- sea_airbnb %>%
  group_by(neighbourhood_group, instant_bookable) %>%
  summarise(med_revenue = median(annual_rev)) %>%
  ggplot(
    aes(
      x = neighbourhood_group,
      y = med_revenue,
      color = instant_bookable,
      group = instant_bookable
    )
  ) +
  geom_line() +
  geom_point() +
  coord_flip() +
  annotate(
    "text",
    x = 16.5, y = 5150,
    label = "High revenue range", color = "black", size = 3.5
  ) +
  geom_hline(
    yintercept = c(6200, 4000),
    linetype = 2, color = "black", alpha = 0.5
  ) +
  scale_color_manual(
    values = c("#969696", "red"),
    labels = c("No", "Yes"),
    guide = guide_legend(reverse = TRUE)
  ) +
  scale_y_continuous(labels = dollar) +
  labs(
    title = "Making Your Home Instant Bookable",
    subtitle = "Revenue of Instant Bookable Airbnb home is higher",
    x = "Neighbourhood Group",
    y = "Median Revenue",
    color = "Instant Bookable"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.ticks.x = element_blank(),
    axis.line.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank(),
    axis.text.x = element_text(face = "bold"),
    legend.position = "bottom"
  )

# show the graph
rev_neighbour_book

11.3 reviews_per_month/price

# Flag listings with at least 10 reviews per month as "high", all others as
# "low"; rows with a missing reviews_per_month stay NA. "low" is the first
# factor level so it takes the first (grey) colour in the plot below.
sea_airbnb$high_monthly_reviews <- factor(
  ifelse(sea_airbnb$reviews_per_month >= 10, "high", "low"),
  levels = c("low", "high")
)

# Scatter of price against reviews per month, highlighting the "high" review
# listings in red, with a dashed reference line at a price of 250. The colour
# aesthetic already separates the two classes, so no group_by() is needed.
review_price <- ggplot(
  sea_airbnb,
  aes(x = reviews_per_month, y = price, color = high_monthly_reviews)
) +
  geom_point(alpha = 0.8) +
  geom_hline(yintercept = 250, linetype = 2, color = "black") +
  scale_color_manual(values = c("#999999", "red")) +
  scale_y_continuous(labels = dollar) +
  ggtitle("What Price Range Is More Welcomed??", subtitle = "Home price lower than $250") +
  labs(x = "Reviews Per Month", y = "Price") +
  theme_classic() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.ticks.x = element_blank(),
    axis.line.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank(),
    legend.position = "none"
  )

# show the graph
review_price

11.4 Price/Neighbourhood group

# Median listing price per neighbourhood group, drawn as a single line with the
# axes flipped so the groups read top to bottom.
price_neighbour <- sea_airbnb %>%
  group_by(neighbourhood_group) %>%
  summarise(med_price = median(price)) %>%
  ggplot(aes(x = neighbourhood_group, y = med_price)) +
  # The data already holds one median per group, so the line is drawn directly
  # instead of re-summarising through stat_summary() and its deprecated
  # `fun.y` argument; group = 1 joins the discrete x values into one line
  geom_line(aes(group = 1), lwd = 0.6) +
  coord_flip() +
  # Dashed markers at x positions 4, 7 and 13
  # NOTE(review): these presumably pick out Cascade, Downtown and Queen Anne
  # under alphabetical level order; confirm if the levels ever change
  geom_vline(
    xintercept = c(13, 7, 4),
    linetype = 2, color = "red", alpha = 0.75
  ) +
  scale_y_continuous(labels = dollar) +
  ggtitle("Which Neighbourhood Groups Have High Home Price??", subtitle = "Downtown, Cascade and Queen Anne have highest median price") +
  labs(x = "Neighbourhood Group", y = "Median Price") +
  theme_classic() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.ticks.x = element_blank(),
    axis.line.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank(),
    legend.position = "none"
  )

# show the graph
price_neighbour

12 Save visuals for use in other documents

# Write each of the four finished plots to a PNG file named after its object.
# No width or height is passed, so ggsave() picks the size it reports below.
ggsave(filename = "rev_suphost_perform.png", plot = rev_suphost_perform)
## Saving 7 x 5 in image
ggsave(filename = "rev_neighbour_book.png", plot = rev_neighbour_book)
## Saving 7 x 5 in image
ggsave(filename = "review_price.png", plot = review_price)
## Saving 7 x 5 in image
ggsave(filename = "price_neighbour.png", plot = price_neighbour)
## Saving 7 x 5 in image